{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def load_label(fname, split=False, rate=None):\n",
    "    f = open(fname)\n",
    "    cnt, idx, tidx = 0, [], []\n",
    "    for line in f:\n",
    "        cnt += 1\n",
    "        if cnt % 10000 == 0:\n",
    "            print cnt\n",
    "        terms = line.strip().split('\\t')\n",
    "        idx.append(terms[0])\n",
    "        if len(terms) == 2:\n",
    "            tidx.append(terms[1])\n",
    "    if split:\n",
    "        ids = np.arange(cnt)\n",
    "        np.random.seed(1024)\n",
    "        np.random.shuffle(ids)\n",
    "        train_id = ids[:int(cnt*rate)]\n",
    "        val_id = ids[int(cnt*rate):]\n",
    "        return [tidx[i] for i in train_id], [tidx[i] for i in val_id]\n",
    "    return tidx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def getLabelid(data, topic_dict):\n",
    "    data_idx = []\n",
    "    for item in data:\n",
    "        item_arr = [ii for ii in item.split(',') if ii != '']\n",
    "        ex = np.zeros(len(item_arr))\n",
    "        for i, topic in enumerate(item_arr):\n",
    "            idx = topic_dict.get(topic)\n",
    "            ex[i] = idx\n",
    "        data_idx.append(ex.astype('int32'))\n",
    "    return data_idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10000\n",
      "20000\n",
      "30000\n",
      "40000\n",
      "50000\n",
      "60000\n",
      "70000\n",
      "80000\n",
      "90000\n",
      "100000\n",
      "110000\n",
      "120000\n",
      "130000\n",
      "140000\n",
      "150000\n",
      "160000\n",
      "170000\n",
      "180000\n",
      "190000\n",
      "200000\n",
      "210000\n",
      "220000\n",
      "230000\n",
      "240000\n",
      "250000\n",
      "260000\n",
      "270000\n",
      "280000\n",
      "290000\n",
      "300000\n",
      "310000\n",
      "320000\n",
      "330000\n",
      "340000\n",
      "350000\n",
      "360000\n",
      "370000\n",
      "380000\n",
      "390000\n",
      "400000\n",
      "410000\n",
      "420000\n",
      "430000\n",
      "440000\n",
      "450000\n",
      "460000\n",
      "470000\n",
      "480000\n",
      "490000\n",
      "500000\n",
      "510000\n",
      "520000\n",
      "530000\n",
      "540000\n",
      "550000\n",
      "560000\n",
      "570000\n",
      "580000\n",
      "590000\n",
      "600000\n",
      "610000\n",
      "620000\n",
      "630000\n",
      "640000\n",
      "650000\n",
      "660000\n",
      "670000\n",
      "680000\n",
      "690000\n",
      "700000\n",
      "710000\n",
      "720000\n",
      "730000\n",
      "740000\n",
      "750000\n",
      "760000\n",
      "770000\n",
      "780000\n",
      "790000\n",
      "800000\n",
      "810000\n",
      "820000\n",
      "830000\n",
      "840000\n",
      "850000\n",
      "860000\n",
      "870000\n",
      "880000\n",
      "890000\n",
      "900000\n",
      "910000\n",
      "920000\n",
      "930000\n",
      "940000\n",
      "950000\n",
      "960000\n",
      "970000\n",
      "980000\n",
      "990000\n",
      "1000000\n",
      "1010000\n",
      "1020000\n",
      "1030000\n",
      "1040000\n",
      "1050000\n",
      "1060000\n",
      "1070000\n",
      "1080000\n",
      "1090000\n",
      "1100000\n",
      "1110000\n",
      "1120000\n",
      "1130000\n",
      "1140000\n",
      "1150000\n",
      "1160000\n",
      "1170000\n",
      "1180000\n",
      "1190000\n",
      "1200000\n",
      "1210000\n",
      "1220000\n",
      "1230000\n",
      "1240000\n",
      "1250000\n",
      "1260000\n",
      "1270000\n",
      "1280000\n",
      "1290000\n",
      "1300000\n",
      "1310000\n",
      "1320000\n",
      "1330000\n",
      "1340000\n",
      "1350000\n",
      "1360000\n",
      "1370000\n",
      "1380000\n",
      "1390000\n",
      "1400000\n",
      "1410000\n",
      "1420000\n",
      "1430000\n",
      "1440000\n",
      "1450000\n",
      "1460000\n",
      "1470000\n",
      "1480000\n",
      "1490000\n",
      "1500000\n",
      "1510000\n",
      "1520000\n",
      "1530000\n",
      "1540000\n",
      "1550000\n",
      "1560000\n",
      "1570000\n",
      "1580000\n",
      "1590000\n",
      "1600000\n",
      "1610000\n",
      "1620000\n",
      "1630000\n",
      "1640000\n",
      "1650000\n",
      "1660000\n",
      "1670000\n",
      "1680000\n",
      "1690000\n",
      "1700000\n",
      "1710000\n",
      "1720000\n",
      "1730000\n",
      "1740000\n",
      "1750000\n",
      "1760000\n",
      "1770000\n",
      "1780000\n",
      "1790000\n",
      "1800000\n",
      "1810000\n",
      "1820000\n",
      "1830000\n",
      "1840000\n",
      "1850000\n",
      "1860000\n",
      "1870000\n",
      "1880000\n",
      "1890000\n",
      "1900000\n",
      "1910000\n",
      "1920000\n",
      "1930000\n",
      "1940000\n",
      "1950000\n",
      "1960000\n",
      "1970000\n",
      "1980000\n",
      "1990000\n",
      "2000000\n",
      "2010000\n",
      "2020000\n",
      "2030000\n",
      "2040000\n",
      "2050000\n",
      "2060000\n",
      "2070000\n",
      "2080000\n",
      "2090000\n",
      "2100000\n",
      "2110000\n",
      "2120000\n",
      "2130000\n",
      "2140000\n",
      "2150000\n",
      "2160000\n",
      "2170000\n",
      "2180000\n",
      "2190000\n",
      "2200000\n",
      "2210000\n",
      "2220000\n",
      "2230000\n",
      "2240000\n",
      "2250000\n",
      "2260000\n",
      "2270000\n",
      "2280000\n",
      "2290000\n",
      "2300000\n",
      "2310000\n",
      "2320000\n",
      "2330000\n",
      "2340000\n",
      "2350000\n",
      "2360000\n",
      "2370000\n",
      "2380000\n",
      "2390000\n",
      "2400000\n",
      "2410000\n",
      "2420000\n",
      "2430000\n",
      "2440000\n",
      "2450000\n",
      "2460000\n",
      "2470000\n",
      "2480000\n",
      "2490000\n",
      "2500000\n",
      "2510000\n",
      "2520000\n",
      "2530000\n",
      "2540000\n",
      "2550000\n",
      "2560000\n",
      "2570000\n",
      "2580000\n",
      "2590000\n",
      "2600000\n",
      "2610000\n",
      "2620000\n",
      "2630000\n",
      "2640000\n",
      "2650000\n",
      "2660000\n",
      "2670000\n",
      "2680000\n",
      "2690000\n",
      "2700000\n",
      "2710000\n",
      "2720000\n",
      "2730000\n",
      "2740000\n",
      "2750000\n",
      "2760000\n",
      "2770000\n",
      "2780000\n",
      "2790000\n",
      "2800000\n",
      "2810000\n",
      "2820000\n",
      "2830000\n",
      "2840000\n",
      "2850000\n",
      "2860000\n",
      "2870000\n",
      "2880000\n",
      "2890000\n",
      "2900000\n",
      "2910000\n",
      "2920000\n",
      "2930000\n",
      "2940000\n",
      "2950000\n",
      "2960000\n",
      "2970000\n",
      "2980000\n",
      "2990000\n",
      "10000\n",
      "20000\n",
      "30000\n",
      "40000\n",
      "50000\n",
      "60000\n",
      "70000\n",
      "80000\n",
      "90000\n",
      "100000\n",
      "110000\n",
      "120000\n",
      "130000\n",
      "140000\n",
      "150000\n",
      "160000\n",
      "170000\n",
      "180000\n",
      "190000\n",
      "200000\n",
      "210000\n",
      "220000\n",
      "230000\n",
      "240000\n",
      "250000\n",
      "260000\n",
      "270000\n",
      "280000\n",
      "290000\n",
      "300000\n",
      "310000\n",
      "320000\n",
      "330000\n",
      "340000\n",
      "350000\n",
      "360000\n",
      "370000\n",
      "380000\n",
      "390000\n",
      "400000\n",
      "410000\n",
      "420000\n",
      "430000\n",
      "440000\n",
      "450000\n",
      "460000\n",
      "470000\n",
      "480000\n",
      "490000\n",
      "500000\n",
      "510000\n",
      "520000\n",
      "530000\n",
      "540000\n",
      "550000\n",
      "560000\n",
      "570000\n",
      "580000\n",
      "590000\n",
      "600000\n",
      "610000\n",
      "620000\n",
      "630000\n",
      "640000\n",
      "650000\n",
      "660000\n",
      "670000\n",
      "680000\n",
      "690000\n",
      "700000\n",
      "710000\n",
      "720000\n",
      "730000\n",
      "740000\n",
      "750000\n",
      "760000\n",
      "770000\n",
      "780000\n",
      "790000\n",
      "800000\n",
      "810000\n",
      "820000\n",
      "830000\n",
      "840000\n",
      "850000\n",
      "860000\n",
      "870000\n",
      "880000\n",
      "890000\n",
      "900000\n",
      "910000\n",
      "920000\n",
      "930000\n",
      "940000\n",
      "950000\n",
      "960000\n",
      "970000\n",
      "980000\n",
      "990000\n",
      "1000000\n",
      "1010000\n",
      "1020000\n",
      "1030000\n",
      "1040000\n",
      "1050000\n",
      "1060000\n",
      "1070000\n",
      "1080000\n",
      "1090000\n",
      "1100000\n",
      "1110000\n",
      "1120000\n",
      "1130000\n",
      "1140000\n",
      "1150000\n",
      "1160000\n",
      "1170000\n",
      "1180000\n",
      "1190000\n",
      "1200000\n",
      "1210000\n",
      "1220000\n",
      "1230000\n",
      "1240000\n",
      "1250000\n",
      "1260000\n",
      "1270000\n",
      "1280000\n",
      "1290000\n",
      "1300000\n",
      "1310000\n",
      "1320000\n",
      "1330000\n",
      "1340000\n",
      "1350000\n",
      "1360000\n",
      "1370000\n",
      "1380000\n",
      "1390000\n",
      "1400000\n",
      "1410000\n",
      "1420000\n",
      "1430000\n",
      "1440000\n",
      "1450000\n",
      "1460000\n",
      "1470000\n",
      "1480000\n",
      "1490000\n",
      "1500000\n",
      "1510000\n",
      "1520000\n",
      "1530000\n",
      "1540000\n",
      "1550000\n",
      "1560000\n",
      "1570000\n",
      "1580000\n",
      "1590000\n",
      "1600000\n",
      "1610000\n",
      "1620000\n",
      "1630000\n",
      "1640000\n",
      "1650000\n",
      "1660000\n",
      "1670000\n",
      "1680000\n",
      "1690000\n",
      "1700000\n",
      "1710000\n",
      "1720000\n",
      "1730000\n",
      "1740000\n",
      "1750000\n",
      "1760000\n",
      "1770000\n",
      "1780000\n",
      "1790000\n",
      "1800000\n",
      "1810000\n",
      "1820000\n",
      "1830000\n",
      "1840000\n",
      "1850000\n",
      "1860000\n",
      "1870000\n",
      "1880000\n",
      "1890000\n",
      "1900000\n",
      "1910000\n",
      "1920000\n",
      "1930000\n",
      "1940000\n",
      "1950000\n",
      "1960000\n",
      "1970000\n",
      "1980000\n",
      "1990000\n",
      "2000000\n",
      "2010000\n",
      "2020000\n",
      "2030000\n",
      "2040000\n",
      "2050000\n",
      "2060000\n",
      "2070000\n",
      "2080000\n",
      "2090000\n",
      "2100000\n",
      "2110000\n",
      "2120000\n",
      "2130000\n",
      "2140000\n",
      "2150000\n",
      "2160000\n",
      "2170000\n",
      "2180000\n",
      "2190000\n",
      "2200000\n",
      "2210000\n",
      "2220000\n",
      "2230000\n",
      "2240000\n",
      "2250000\n",
      "2260000\n",
      "2270000\n",
      "2280000\n",
      "2290000\n",
      "2300000\n",
      "2310000\n",
      "2320000\n",
      "2330000\n",
      "2340000\n",
      "2350000\n",
      "2360000\n",
      "2370000\n",
      "2380000\n",
      "2390000\n",
      "2400000\n",
      "2410000\n",
      "2420000\n",
      "2430000\n",
      "2440000\n",
      "2450000\n",
      "2460000\n",
      "2470000\n",
      "2480000\n",
      "2490000\n",
      "2500000\n",
      "2510000\n",
      "2520000\n",
      "2530000\n",
      "2540000\n",
      "2550000\n",
      "2560000\n",
      "2570000\n",
      "2580000\n",
      "2590000\n",
      "2600000\n",
      "2610000\n",
      "2620000\n",
      "2630000\n",
      "2640000\n",
      "2650000\n",
      "2660000\n",
      "2670000\n",
      "2680000\n",
      "2690000\n",
      "2700000\n",
      "2710000\n",
      "2720000\n",
      "2730000\n",
      "2740000\n",
      "2750000\n",
      "2760000\n",
      "2770000\n",
      "2780000\n",
      "2790000\n",
      "2800000\n",
      "2810000\n",
      "2820000\n",
      "2830000\n",
      "2840000\n",
      "2850000\n",
      "2860000\n",
      "2870000\n",
      "2880000\n",
      "2890000\n",
      "2900000\n",
      "2910000\n",
      "2920000\n",
      "2930000\n",
      "2940000\n",
      "2950000\n",
      "2960000\n",
      "2970000\n",
      "2980000\n",
      "2990000\n"
     ]
    }
   ],
   "source": [
    "label_file = '../ieee_zhihu_cup/question_topic_train_set.txt'\n",
    "train_label_idx, val_label_idx = load_label(label_file, split=True, rate=0.9)\n",
    "train_label_idx_all = load_label(label_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "topic_idx = list(np.load('../data_preprocess/topic/topic_idx.npy'))\n",
    "topic_idx_dict = {tid:i for i, tid in enumerate(topic_idx)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Get train label finished!\n",
      "Get val label finished!\n",
      "Get train label all finished!\n"
     ]
    }
   ],
   "source": [
    "train_label = getLabelid(train_label_idx, topic_idx_dict)\n",
    "print 'Get train label finished!'\n",
    "val_label = getLabelid(val_label_idx, topic_idx_dict)\n",
    "print 'Get val label finished!'\n",
    "train_label_all = getLabelid(train_label_idx_all, topic_idx_dict)\n",
    "print 'Get train label all finished!'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Save train label finished!\n",
      "Save val label finished!\n",
      "Save train label all finished!\n"
     ]
    }
   ],
   "source": [
    "basedir = './train'\n",
    "np.save('{}/train_label_indices.npy'.format(basedir), train_label)\n",
    "print 'Save train label finished!'\n",
    "basedir = './val'\n",
    "np.save('{}/val_label_indices.npy'.format(basedir), val_label)\n",
    "print 'Save val label finished!'\n",
    "basedir = './train_all'\n",
    "np.save('{}/train_label_indices_all.npy'.format(basedir), train_label_all)\n",
    "print 'Save train label all finished!'"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
